library(keras)
library(magick)
library(parallel)
library(doParallel)
library(abind)
library(reticulate)
library(plyr)
The data for this walkthrough comes from last year's Kaggle competition, the Carvana Image Masking Challenge. Carvana, an online used-car retailer, shoots high-quality photos of its cars, but existing automatic background-removal methods are often fooled by reflections or backgrounds of similar color, so the company wanted a more robust approach. This is a natural application for semantic segmentation. The dataset downloaded from the site contains 5,088 training image/mask pairs and 100,064 test images. R has several packages for general image processing (e.g. magick, EBImage, imager, OpenImageR); this walkthrough uses magick.
First, load a single image with image_read as a test, set its background to black with image_background, and inspect it with image_info:
# file names
jpg_names <- list.files('data/ImageMasking/train', full.names = T)
mask_names <- list.files('data/ImageMasking/train_masks', full.names = T)
# read files; set background to black
jpg <- image_background(image_read(jpg_names[1]), 'black')
mask <- image_background(image_read(mask_names[1]), 'black')
image_info(jpg)
##   format width height colorspace matte filesize density
## 1   JPEG  1918   1280       sRGB FALSE        0   72x72
image_info(mask)
##   format width height colorspace matte filesize density
## 1    GIF  1918   1280       sRGB FALSE        0   72x72
To support data augmentation, the following custom functions apply image transformations.
# rotate, zoom, crop
rotate_image <- function(jpg, mask, left_lim = -20, right_lim = 20) {
  # apply the transform with probability ~0.5
  if (rnorm(1) < 0) return(list(jpg = jpg, mask = mask))
  degree <- runif(1, left_lim, right_lim)
  jpg <- image_rotate(jpg, degree)
  mask <- image_rotate(mask, degree)
  original_w <- image_info(jpg)$width   # canvas size after rotation
  original_h <- image_info(jpg)$height
  # zoom in proportionally to the rotation angle to hide the rotated borders
  zoom <- round((abs(degree) + 100) / 100, 2)
  zoom_w <- round(original_w * zoom, 0)
  zoom_h <- round(original_h * zoom, 0)
  jpg <- image_sample(jpg, paste0(zoom_w, 'x', zoom_h, '!'))
  mask <- image_sample(mask, paste0(zoom_w, 'x', zoom_h, '!'))
  # crop back to the pre-zoom canvas size, centered
  drop_w <- round((zoom_w - original_w) / 2, 0)
  drop_h <- round((zoom_h - original_h) / 2, 0)
  jpg <- image_crop(jpg, paste0(original_w, 'x', original_h, '+', drop_w, '+', drop_h))
  mask <- image_crop(mask, paste0(original_w, 'x', original_h, '+', drop_w, '+', drop_h))
  return(list(jpg = jpg, mask = mask))
}
# horizontal flip
horizontal_image <- function(jpg, mask) {
  # apply the flip with probability ~0.5
  if (rnorm(1) < 0) return(list(jpg = jpg, mask = mask))
  jpg <- image_flop(jpg)
  mask <- image_flop(mask)
  return(list(jpg = jpg, mask = mask))
}
# brightness, saturation, hue
modulate_image <- function(img,
                           brightness_lim = c(100, 140),
                           saturation_lim = c(90, 130),
                           hue_lim = c(80, 120)) {
  # apply the shift with probability ~0.5
  if (rnorm(1) < 0) return(img)
  b_shift <- runif(1, brightness_lim[1], brightness_lim[2])
  s_shift <- runif(1, saturation_lim[1], saturation_lim[2])
  h_shift <- runif(1, hue_lim[1], hue_lim[2])
  img <- image_modulate(img, brightness = b_shift, saturation = s_shift, hue = h_shift)
  return(img)
}
A plotting function (it handles both magick images and plain arrays):
# plot
plot_img <- function(obj, title = NA) {
  obj_w <- NULL
  obj_h <- NULL
  obj_img <- obj
  # magick image
  if (inherits(obj, "magick-image")) {
    obj_w <- image_info(obj)$width
    obj_h <- image_info(obj)$height
  }
  # 4-d array (batch, h, w, channels): plot the first sample
  if (length(dim(obj)) == 4) {
    obj_w <- dim(obj)[3]
    obj_h <- dim(obj)[2]
    obj_img <- obj[1, , , ]
  }
  # plain matrix or 3-d array
  if (length(dim(obj)) == 3 || length(dim(obj)) == 2) {
    obj_w <- dim(obj)[2]
    obj_h <- dim(obj)[1]
  }
  par(mar = rep(0.35, 4))
  plot(c(0, obj_w), c(0, obj_h), type = "n", xlab = "", ylab = "", axes = FALSE, main = title)
  rasterImage(obj_img, 0, 0, obj_w, obj_h)
}
Apply rotate_image to the test pair and plot the results:
# rotate, zoom, crop
img_pair <- rotate_image(jpg, mask)
plot_img(img_pair$jpg)
plot_img(img_pair$mask)
Next, plot a blended composite of the image and its mask:
# composite
img_tmp <- image_composite(img_pair$jpg, img_pair$mask, operator = "blend", compose_args = "60")
plot_img(img_tmp)
To match the model's expected input format, the following functions convert images to arrays:
# convert an image to an array
img_array <- function(img, h = 224, w = 224) {
  img <- image_sample(img, paste0(w, 'x', h, '!'))
  arr <- array(data = as.numeric(img[[1]]), dim = c(1, h, w, 3))
  return(arr)
}
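One caveat worth checking on your magick version: img[[1]] returns a channel-first bitmap (3 x width x height), so if the conversion above scrambles the axes, an explicit aperm() puts the data into Keras's (batch, height, width, channels) layout. A hedged alternative sketch (img_array_hwc is an illustrative name; the same idea applies to mask_array below):
# Convert with an explicit axis permutation.
img_array_hwc <- function(img, h = 224, w = 224) {
  img <- image_sample(img, paste0(w, 'x', h, '!'))
  bmp <- as.numeric(img[[1]])     # 3 x w x h, values in [0, 1]
  arr <- aperm(bmp, c(3, 2, 1))   # -> h x w x 3
  dim(arr) <- c(1, h, w, 3)       # prepend the batch dimension
  return(arr)
}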
mask_array <- function(img, h = 224, w = 224) {
  img <- image_sample(img, paste0(w, 'x', h, '!'))
  arr <- array(data = as.numeric(img[[1]]), dim = c(1, h, w, 1))
  return(arr)
}
The competition's evaluation metric is the Dice coefficient, Dice = 2|X ∩ Y| / (|X| + |Y|). Following that formula (with a smoothing term so the denominator is never zero), the metric and a combined loss are defined as follows:
# dice metric
K <- backend()
dice <- function(y_true, y_pred, smooth = 1.0) {
  y_true_f <- K$flatten(y_true)
  y_pred_f <- K$flatten(y_pred)
  intersection <- K$sum(y_true_f * y_pred_f)
  result <- (2 * intersection + smooth) /
    (K$sum(y_true_f) + K$sum(y_pred_f) + smooth)
  return(result)
}
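As a quick spot check of the metric, a toy example using the keras backend helpers (a sketch; values are illustrative):
# Two true pixels, one predicted, one overlapping:
# (2*1 + smooth) / (2 + 1 + smooth) = 0.75 with smooth = 1.
y_t <- k_constant(c(1, 1, 0, 0))
y_p <- k_constant(c(1, 0, 0, 0))
k_eval(dice(y_t, y_p))
## [1] 0.75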
dice_bc_loss <- function(y_true, y_pred) {
  result <- loss_binary_crossentropy(y_true, y_pred) + (1 - dice(y_true, y_pred))
  return(result)
}
The model below uses ResNet50 as its base. Inspired by U-Net, it taps the outputs at layers 172, 140, 130, and so on, then upsamples and concatenates them, finally producing an output of shape 224 x 224 x 1:
set_ResNet50 <- function(input_shape = c(224, 224, 3), num_classes = 1) {
  inputs <- layer_input(shape = input_shape)
  base_model <- application_resnet50(weights = 'imagenet', include_top = FALSE, input_tensor = inputs)
  base_model$layers <- base_model$layers[1:172]
  # skip connections tapped from the base model
  op_7a <- base_model$layers[[172]]$output
  # 7 x 7 x 2048
  op_14a <- base_model$layers[[140]]$output
  op_14b <- base_model$layers[[130]]$output
  # 14 x 14 x 1024
  op_28a <- base_model$layers[[78]]$output
  op_28b <- base_model$layers[[68]]$output
  # 28 x 28 x 512
  op_56a <- base_model$layers[[36]]$output %>%
    layer_zero_padding_2d(padding = list(c(1, 0), c(1, 0)))
  op_56b <- base_model$layers[[26]]$output %>%
    layer_zero_padding_2d(padding = list(c(1, 0), c(1, 0)))
  # 55 x 55 x 256 -> 56 x 56 x 256
  op_112a <- base_model$layers[[3]]$output
  # 112 x 112 x 64
  up_1 <- op_7a %>%
    # 7 x 7 x 2048
    layer_batch_normalization() %>%
    layer_upsampling_2d(size = c(2, 2)) %>%
    # 14 x 14
    {layer_concatenate(inputs = list(op_14a, op_14b, .), axis = 3)} %>%
    layer_batch_normalization() %>%
    layer_conv_2d(filters = 1024, kernel_size = c(3, 3), padding = "same", activation = 'relu') %>%
    layer_batch_normalization() %>%
    layer_conv_2d(filters = 1024, kernel_size = c(3, 3), padding = "same", activation = 'relu')
  up_2 <- up_1 %>%
    # 14 x 14 x 1024
    layer_batch_normalization() %>%
    layer_upsampling_2d(size = c(2, 2)) %>%
    # 28 x 28
    {layer_concatenate(inputs = list(op_28a, op_28b, .), axis = 3)} %>%
    layer_batch_normalization() %>%
    layer_conv_2d(filters = 512, kernel_size = c(3, 3), padding = "same", activation = 'relu') %>%
    layer_batch_normalization() %>%
    layer_conv_2d(filters = 512, kernel_size = c(3, 3), padding = "same", activation = 'relu')
  up_3 <- up_2 %>%
    # 28 x 28 x 512
    layer_batch_normalization() %>%
    layer_conv_2d_transpose(filters = 512, kernel_size = c(4, 4), padding = "same", strides = c(2, 2), use_bias = FALSE, activation = 'relu') %>%
    # 56 x 56 x 512
    {layer_concatenate(inputs = list(op_56a, op_56b, .), axis = 3)} %>%
    layer_batch_normalization() %>%
    layer_conv_2d(filters = 256, kernel_size = c(3, 3), padding = "same", activation = 'relu') %>%
    layer_batch_normalization() %>%
    layer_conv_2d(filters = 256, kernel_size = c(3, 3), padding = "same", activation = 'relu')
  up_4 <- up_3 %>%
    # 56 x 56 x 256
    layer_batch_normalization() %>%
    layer_conv_2d_transpose(filters = 128, kernel_size = c(4, 4), padding = "same", strides = c(2, 2), use_bias = FALSE, activation = 'relu') %>%
    # 112 x 112 x 128
    {layer_concatenate(inputs = list(op_112a, .), axis = 3)} %>%
    layer_batch_normalization() %>%
    layer_conv_2d(filters = 64, kernel_size = c(3, 3), padding = "same", activation = 'relu') %>%
    layer_batch_normalization() %>%
    layer_conv_2d(filters = 64, kernel_size = c(3, 3), padding = "same", activation = 'relu')
  up_5 <- up_4 %>%
    # 112 x 112 x 64
    layer_batch_normalization() %>%
    layer_conv_2d_transpose(filters = 64, kernel_size = c(4, 4), padding = "same", strides = c(2, 2), use_bias = FALSE, activation = 'relu') %>%
    # 224 x 224 x 64
    layer_batch_normalization() %>%
    layer_conv_2d(filters = 32, kernel_size = c(3, 3), padding = "same", activation = 'relu') %>%
    layer_batch_normalization() %>%
    layer_conv_2d(filters = 32, kernel_size = c(3, 3), padding = "same", activation = 'relu') %>%
    layer_batch_normalization() %>%
    layer_conv_2d(filters = num_classes, kernel_size = c(1, 1), activation = 'sigmoid')
  model <- keras_model(inputs = base_model$input, outputs = up_5)
  # parallelize across 2 GPUs; drop this line on a single-GPU machine
  model <- multi_gpu_model(model, gpus = 2)
  model %>% compile(
    optimizer = optimizer_rmsprop(lr = 0.0001),
    loss = dice_bc_loss,
    metrics = c(dice = dice, 'accuracy')
  )
  return(model)
}
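To verify which layer indices carry which feature-map sizes (e.g. layer 172 giving 7 x 7 x 2048), the base model's layers can be enumerated; a small sketch:
# Print index, name, and output shape for every ResNet50 layer, to confirm
# the skip-connection indices tapped above.
base <- application_resnet50(weights = 'imagenet', include_top = FALSE,
                             input_shape = c(224, 224, 3))
for (i in seq_along(base$layers)) {
  l <- base$layers[[i]]
  cat(sprintf("%3d %-20s %s\n", i, l$name,
              paste(unlist(l$output_shape), collapse = " x ")))
}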
# build the model
model <- set_ResNet50()
Because of the image sizes and memory limits, and because data augmentation greatly inflates the effective amount of data, holding everything in memory is not feasible; a generator function is therefore necessary. The batch_size can be tuned to however large a buffer the GPU accepts.
#generator
train_generator <- function(jpg_names, mask_names, batch_size) {
  jpg_fullnames <- jpg_names
  jpg_fullnames_all <- jpg_names
  mask_fullnames <- mask_names
  mask_fullnames_all <- mask_names
  function() {
    # start a new epoch: reset the remaining file pool
    if (length(jpg_fullnames) < batch_size) {
      jpg_fullnames <<- jpg_fullnames_all
      mask_fullnames <<- mask_fullnames_all
    }
    # draw a random batch without replacement
    batch_index <- sample(1:length(jpg_fullnames), batch_size)
    batch_jpg <- jpg_fullnames[batch_index]
    jpg_fullnames <<- jpg_fullnames[-batch_index]
    batch_mask <- mask_fullnames[batch_index]
    mask_fullnames <<- mask_fullnames[-batch_index]
    jpg_mask_batch <- foreach(i = 1:batch_size) %dopar% {
      # read the image and set the background to black
      img_jpg <- image_background(image_read(batch_jpg[i]), 'black')
      img_mask <- image_background(image_read(batch_mask[i]), 'black')
      # data augmentation
      img_jpg <- modulate_image(img_jpg)
      img_pair <- horizontal_image(img_jpg, img_mask)
      img_pair <- rotate_image(img_pair$jpg, img_pair$mask)
      list(j = img_array(img_pair$jpg), p = mask_array(img_pair$mask))
    }
    # bind the per-image arrays into batch arrays
    jpg_mask_batch <- purrr::transpose(jpg_mask_batch)
    jpg_batch <- do.call(abind, c(jpg_mask_batch$j, list(along = 1)))
    mask_batch <- do.call(abind, c(jpg_mask_batch$p, list(along = 1)))
    result <- list(keras_array(jpg_batch), keras_array(mask_batch))
    return(result)
  }
}
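Before wiring the generator into training, it can be sanity-checked by drawing a single batch; a sketch (note that %dopar% needs the doParallel backend registered further below, otherwise foreach falls back to sequential execution with a warning):
# Draw one batch and confirm the shapes Keras will receive.
gen <- train_generator(jpg_names, mask_names, batch_size = 4)
batch <- gen()
dim(batch[[1]])  # 4 224 224 3 (images)
dim(batch[[2]])  # 4 224 224 1 (masks)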
test_generator <- function(jpg_names, mask_names, batch_size) {
  jpg_fullnames <- jpg_names
  jpg_fullnames_all <- jpg_names
  mask_fullnames <- mask_names
  mask_fullnames_all <- mask_names
  function() {
    # start a new epoch: reset the remaining file pool
    if (length(jpg_fullnames) < batch_size) {
      jpg_fullnames <<- jpg_fullnames_all
      mask_fullnames <<- mask_fullnames_all
    }
    batch_index <- sample(1:length(jpg_fullnames), batch_size)
    batch_jpg <- jpg_fullnames[batch_index]
    jpg_fullnames <<- jpg_fullnames[-batch_index]
    batch_mask <- mask_fullnames[batch_index]
    mask_fullnames <<- mask_fullnames[-batch_index]
    # no augmentation for validation batches
    jpg_mask_batch <- foreach(i = 1:batch_size) %dopar% {
      img_jpg <- image_background(image_read(batch_jpg[i]), 'black')
      img_mask <- image_background(image_read(batch_mask[i]), 'black')
      list(j = img_array(img_jpg), p = mask_array(img_mask))
    }
    jpg_mask_batch <- purrr::transpose(jpg_mask_batch)
    jpg_batch <- do.call(abind, c(jpg_mask_batch$j, list(along = 1)))
    mask_batch <- do.call(abind, c(jpg_mask_batch$p, list(along = 1)))
    result <- list(keras_array(jpg_batch), keras_array(mask_batch))
    return(result)
  }
}
The parameter settings used here are as follows. The py_iterator method wraps an R function (the generator) as a Python-style iterator, so batches of images can be fed to training iteratively:
batch_size <- 24
epochs <- 100
set.seed(777)
index <- sample(length(jpg_names), 0.8 * length(jpg_names))
#train
train_jpg <- jpg_names[index]
train_mask <- mask_names[index]
#test
test_jpg <- jpg_names[-index]
test_mask <- mask_names[-index]
#iterator
train_iterator <- py_iterator(train_generator(train_jpg, train_mask, batch_size))
test_iterator <- py_iterator(test_generator(test_jpg, test_mask, batch_size))
Enable 4-core parallel computation and define the earlier helper functions in each worker's environment:
# doParallel
cl <- makePSOCKcluster(4)
registerDoParallel(cl)
clusterEvalQ(cl, {
  library(magick)
  library(abind)
  library(reticulate)
  # rotate, zoom, crop
  rotate_image <- function(jpg, mask, left_lim = -20, right_lim = 20) {
    if (rnorm(1) < 0) return(list(jpg = jpg, mask = mask))
    degree <- runif(1, left_lim, right_lim)
    jpg <- image_rotate(jpg, degree)
    mask <- image_rotate(mask, degree)
    original_w <- image_info(jpg)$width
    original_h <- image_info(jpg)$height
    zoom <- round((abs(degree) + 100) / 100, 2)
    zoom_w <- round(original_w * zoom, 0)
    zoom_h <- round(original_h * zoom, 0)
    jpg <- image_sample(jpg, paste0(zoom_w, 'x', zoom_h, '!'))
    mask <- image_sample(mask, paste0(zoom_w, 'x', zoom_h, '!'))
    drop_w <- round((zoom_w - original_w) / 2, 0)
    drop_h <- round((zoom_h - original_h) / 2, 0)
    jpg <- image_crop(jpg, paste0(original_w, 'x', original_h, '+', drop_w, '+', drop_h))
    mask <- image_crop(mask, paste0(original_w, 'x', original_h, '+', drop_w, '+', drop_h))
    return(list(jpg = jpg, mask = mask))
  }
  # horizontal flip
  horizontal_image <- function(jpg, mask) {
    if (rnorm(1) < 0) return(list(jpg = jpg, mask = mask))
    jpg <- image_flop(jpg)
    mask <- image_flop(mask)
    return(list(jpg = jpg, mask = mask))
  }
  # brightness, saturation, hue
  modulate_image <- function(img,
                             brightness_lim = c(100, 140),
                             saturation_lim = c(90, 130),
                             hue_lim = c(80, 120)) {
    if (rnorm(1) < 0) return(img)
    b_shift <- runif(1, brightness_lim[1], brightness_lim[2])
    s_shift <- runif(1, saturation_lim[1], saturation_lim[2])
    h_shift <- runif(1, hue_lim[1], hue_lim[2])
    img <- image_modulate(img, brightness = b_shift, saturation = s_shift, hue = h_shift)
    return(img)
  }
  # convert an image to an array
  img_array <- function(img, h = 224, w = 224) {
    img <- image_sample(img, paste0(w, 'x', h, '!'))
    arr <- array(data = as.numeric(img[[1]]), dim = c(1, h, w, 3))
    return(arr)
  }
  mask_array <- function(img, h = 224, w = 224) {
    img <- image_sample(img, paste0(w, 'x', h, '!'))
    arr <- array(data = as.numeric(img[[1]]), dim = c(1, h, w, 1))
    return(arr)
  }
})
Set up TensorBoard. In callbacks_list, define early stopping, learning-rate reduction on plateau, and model checkpointing:
# TensorBoard setup
tensorboard("data/ImageMasking/logs_r")
# callback setup
callbacks_list <- list(
  callback_tensorboard(log_dir = "data/ImageMasking/logs_r", batch_size = batch_size),
  callback_early_stopping(monitor = "val_dice",
                          min_delta = 0.0001,
                          patience = 6,
                          verbose = 1,
                          mode = "max"),
  callback_reduce_lr_on_plateau(monitor = "val_dice",
                                factor = 0.1,
                                patience = 3,
                                verbose = 1,
                                epsilon = 0.0001,
                                mode = "max"),
  callback_model_checkpoint(filepath = "data/ImageMasking/FCN_{epoch:03d}.h5",
                            monitor = "val_dice",
                            save_best_only = TRUE,
                            save_weights_only = TRUE,
                            mode = "max")
)
# start training
model %>% fit_generator(
  generator = train_iterator,
  steps_per_epoch = as.integer(length(train_jpg) / batch_size),
  epochs = epochs,
  validation_data = test_iterator,
  validation_steps = as.integer(length(test_jpg) / batch_size),
  callbacks = callbacks_list
)
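Once training stops, the held-out split can be scored through the same iterator interface; an optional sketch:
# Evaluate the trained model on the validation iterator
# (keep the cluster running while the iterator is in use).
model %>% evaluate_generator(
  test_iterator,
  steps = as.integer(length(test_jpg) / batch_size)
)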
# shut down the parallel cluster
stopCluster(cl)
gc()
The training results are shown below; training stopped early at epoch 50.
Predicting on the held-out test data (middle: the original mask, right: the prediction), the results look quite good; fine details such as wheel rims and the antenna are captured. Keep in mind, though, that this is on the downscaled 224x224 images.
Below is the composite of the original image and its mask.
Below, the 224x224 prediction is upscaled back to the original size (1280x1918); the upscaling leaves jagged edges, but the overall coverage is still reasonable.
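A hedged sketch of that predict-and-upscale step for one held-out image (variable names are illustrative, and image_read() accepting a raster object depends on the magick version):
# Predict one 224x224 mask, convert it back to a magick image, and upscale.
img <- image_background(image_read(test_jpg[1]), 'black')
pred <- model %>% predict(img_array(img))         # 1 x 224 x 224 x 1
pred_img <- image_read(as.raster(pred[1, , , ]))  # grayscale raster -> magick
pred_img <- image_sample(pred_img, '1918x1280!')  # back to original size
plot_img(pred_img)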
Since the backgrounds in the Carvana data are uniform, how would the model fare on real-world photos? Out of curiosity, I picked a few images from the web and ran predictions. Prediction 1 (middle image) uses the epoch-50 weights, and the results are clearly not ideal. However, because the base model was trained on ImageNet weights, background objects (mountains, people, and so on) are in fact recognized; the model just cannot decide whether they belong to the car or the background, a limitation caused by the uniform backgrounds in the Carvana training data. I therefore added 255 car images from Pascal VOC and retrained from the epoch-50 weights, producing prediction 2 (right image). Compared with the middle image, many background objects can now be separated out, though car-shape recognition still has plenty of room for improvement, likely because the extra data is small and does not cover enough car styles.
Returning to the Carvana competition itself: the submission covers 100,064 test images. To keep uploads small, Kaggle requires a specific run-length-encoded format. My approach is to predict, upscale, and then convert to that format, which takes a while to process. The format-conversion function is below.
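As a toy illustration of the encoding the function performs, with runs written as "start length" pairs over the flattened binary mask:
# Toy mask after flattening: pixels 2-3 and 6-8 are foreground.
bi <- c(0, 1, 1, 0, 0, 1, 1, 1)
diff_bi <- c(bi, 0) - c(0, bi)  # +1 marks a run start, -1 one past its end
starts <- which(diff_bi == 1)   # 2 6
ends <- which(diff_bi == -1)    # 4 9
paste(c(rbind(starts, ends - starts)), collapse = " ")
## [1] "2 2 6 3"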
# convert a predicted mask to Kaggle's run-length file format
file_format <- function(img_mask) {
  mtx <- as.numeric(img_mask[[1]])[, , 1]
  mtx <- t(mtx)
  # binarize at 0.5
  bi <- ifelse(as.numeric(mtx) >= 0.5, 1, 0)
  # run starts/ends from the discrete difference of the padded sequence
  diff_bi <- c(bi, 0) - c(0, bi)
  starts <- which(diff_bi == 1)
  ends <- which(diff_bi == -1)
  fm <- paste(c(rbind(starts, ends - starts)), collapse = " ")
  return(fm)
}
Finally, I retrained and predicted at a quarter of the original resolution (640x960) with a hand-built UNet; the submitted result scored 0.995746.